Code
library(tidyverse)
library(here)
library(scales)library(tidyverse)
library(here)
library(scales)here::i_am("analysis/Kish_HP2.qmd")
library(here)
df <- read.csv(here("data/Employee_Salaries.csv"))# Department
# Abbreivation/accronym for the department name. Example: ABS for the Alcohol Beverage Services department
# Department_Name
# the department name for all active, permanent employees of Montgomery County, MD (Maryland)
# Division
# The division for each department. That is, a single department can have multiple divisions and focuses.
# Example: ABS has the divisions that include those of ABS 85 Administrative Services
# and ABS 85 Beer Delivery Operations
# Gender
# the Gender of M and F. Assuming that since this is filings with the government, M is to be the legal sex of the individual M for Male and F for Female.
# Base_Salary
# precise base salaries (including down to the cent) for the individual, at the end of the calendar year.
# this is projected data. Should be accurate, but projection allows for greater flexibiliity.
# Overtime_Pay
# amount of overtime pay an employee/individual received for working beyond what they were required to do
# Longevity_Pay
# Longevity pay gives additional compensation for qualifying employees who have worked for a good deal of time. The longer time that is worked, the greater the longevity pay. Specifics about how long are dealt with in-house.
# Grade
# the standing an employee has within a company. The internal codes for all positions are not provided, but we can safely look at M1, M2, M3, etc. as those are clearly levels of management. There are thousands of grades.
# M4-Senior Manager
# M3-Middle Manager
# M2-Middle Manager
# M1-First Level Manageremployees of Montgomery County, MD paid in calendar year 2023
Total Compensation Additional Compensation no missing values using sex and not gender
df$Total_Compensation <- df$Base_Salary + df$Overtime_Pay + df$Longevity_Pay
df$Additional_Compensation <- df$Overtime_Pay + df$Longevity_Pay
df <- rename(df, Sex = "Gender")
# str(df)
# df <- df |> mutate(across(where(is.double), as.integer))
sum(is.na(df))[1] 0
# No NA ValuesHow many employees are there? How many of them get overtime, longevity, either, or both?
#each row is a unique employee
Total_Employees <- nrow(df)
Employees_Without_Base <- filter(df, Base_Salary == 0.00) |> nrow()
df <- mutate(df, Employee = row_number())
Employees_Without_Overtime <- filter(df, Overtime_Pay == 0.00)
#4,651 employees got no overtime
Employees_With_Overtime <- anti_join(df, Employees_Without_Overtime)
Percent_Overtime = (nrow(Employees_With_Overtime) / Total_Employees) * 100
# 54% ish employees have overtime
Employees_Without_Longevity <- filter(df, Longevity_Pay == 0.00)
#7,454 employees got no longevity pay
Employees_With_Longevity <- anti_join(df, Employees_Without_Longevity)
Percent_Longevity = (nrow(Employees_With_Longevity) / Total_Employees) * 100
# 27% is of employees have longevity pay
Employees_Without_Over_Long <- filter(df, Overtime_Pay == 0.00 | Longevity_Pay == 0.00)
# 3491 employees have no overtime or longevity pay
Employees_With_Over_Long <- anti_join(df, Employees_Without_Over_Long)
Percent_Over_Long <- (nrow(Employees_With_Over_Long) / Total_Employees) * 100
# 66% percent of employees have overtime or longevity pay
Employees_Without_Add_Comp <- filter(df, Additional_Compensation == 0.00)
# 3491 employees have no overtime or longevity pay
Employees_With_Add_Comp <- anti_join(df, Employees_Without_Add_Comp)
Percent_Add_Comp <- (nrow(Employees_With_Add_Comp) / Total_Employees) * 100
# 66% percent of employees have overtime or longevity payThere are 10291 employees total, where 0 employees have no base salary
Approximately 16.3% have overtime pay, 27.6% have longevity pay, and 16.3% have one or the either.
How much of their total compensation is base + overtime, longevity, either, or both (In terms of percentage?)
# addiing to the original df a new column, using mutate, that is the percent of total
# compensation that is each of the relevant factors.
df <- group_by(df, Employee) |>
mutate(Percent_Over = Overtime_Pay/Total_Compensation)
df$Percent_Over <- round(df$Percent_Over, 3) * 100
df <- group_by(df, Employee) |>
mutate(Percent_Long = Longevity_Pay/Total_Compensation)
df$Percent_Long <- round(df$Percent_Long, 3) * 100
df <- group_by(df, Employee) |>
mutate(Percent_Base = Base_Salary/Total_Compensation)
df$Percent_Base <- round(df$Percent_Base, 3) * 100
df <- group_by(df, Employee) |>
mutate(Percent_Add_Comp = Additional_Compensation/Total_Compensation)
df$Percent_Add_Comp <- round(df$Percent_Add_Comp, 3) * 100
df <- group_by(df, Employee) |>
mutate(Percent_Over_Long = Base_Salary/Total_Compensation)
df$Percent_Over_Long <- round(df$Percent_Over_Long, 3) * 100How much of their total compensation is base + overtime, longevity, either, or both?
#overall
p <- df |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Employees") +
geom_smooth(method = "lm")
pp2 <- Employees_With_Overtime |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Those with Overtime Pay") +
geom_smooth(method = "lm")
p2p3 <- Employees_With_Longevity |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Those with Longevity Pay") +
geom_smooth(method = "lm")
p3p4 <- Employees_With_Over_Long |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Those with Overtime or longevity pay") +
geom_smooth(method = "lm")
p4p5 <- Employees_With_Add_Comp |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All those with additional compensation") +
geom_smooth(method = "lm")
p5the coeff for these graphs
Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(df))
Over_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(Employees_With_Overtime))
Long_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(Employees_With_Longevity))
Over_Long_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(Employees_With_Over_Long))
Add_Comp_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(Employees_With_Add_Comp))
Base_Total_lm$coefficients(Intercept) Base_Salary
3269.028428 1.070269
Over_Base_Total_lm$coefficients (Intercept) Base_Salary
-11910.606577 1.330305
Long_Base_Total_lm$coefficients (Intercept) Base_Salary
-11115.562891 1.284514
Over_Long_Base_Total_lm$coefficients (Intercept) Base_Salary
-32010.085198 1.566415
Add_Comp_Base_Total_lm$coefficients (Intercept) Base_Salary
-4803.650406 1.219424
based on the coefficients for the graphs, those who only get base pay have their total compensation scale up the least with those choosing overtime and longevity pay scaling up the most.
Overtime pay scales slightly larger than longevity pay.
How much of their total compensation is base + overtime, longevity, either, or both (In terms of percentage?)
p6 <- df |>
group_by(Employee) |>
ggplot(mapping = aes(x = Percent_Base, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Percent Base", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All those with additional compensation") +
geom_smooth(method = "lm")
p6p7 <- df |>
group_by(Employee) |>
ggplot(mapping = aes(x = Percent_Long, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Percent Long", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All those with additional compensation") +
geom_smooth(method = "lm")
p7p8 <- df |>
group_by(Employee) |>
ggplot(mapping = aes(x = Percent_Over_Long, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Percent Over Long", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All those with additional compensation") +
geom_smooth(method = "lm")
p8p9 <- df |>
group_by(Employee) |>
ggplot(mapping = aes(x = Percent_Add_Comp, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Percent Add Comp", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All those with additional compensation") +
geom_smooth(method = "lm")
p9the coefficients of the percentage things
coef(lm(Total_Compensation ~ Percent_Base,
data = filter(df))) (Intercept) Percent_Base
245353.463 -1576.445
coef(lm(Total_Compensation ~ Percent_Long,
data = filter(df))) (Intercept) Percent_Long
94062.970 4668.756
coef(lm(Total_Compensation ~ Percent_Over_Long,
data = filter(df))) (Intercept) Percent_Over_Long
245353.463 -1576.445
coef(lm(Total_Compensation ~ Percent_Add_Comp,
data = filter(df))) (Intercept) Percent_Add_Comp
87709.012 1576.445
In terms of these slope coefficients, longevity pay has the greatest scaling.
Interestingly, the more percentage base pay you have, the less your total compensation
#overall
p <- df |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Employees") +
geom_smooth(method = "lm") +
facet_wrap(~Sex)
pp2 <- Employees_With_Overtime |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Those with Overtime Pay") +
geom_smooth(method = "lm") +
facet_wrap(~Sex)
p2p3 <- Employees_With_Longevity |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Those with Longevity Pay") +
geom_smooth(method = "lm") +
facet_wrap(~Sex)
p3p4 <- Employees_With_Over_Long |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Those with Overtime or longevity pay") +
geom_smooth(method = "lm") +
facet_wrap(~Sex)
p4p5 <- Employees_With_Add_Comp |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All those with additional compensation") +
geom_smooth(method = "lm") +
facet_wrap(~Sex)
p5the coeff for the data, on gender
M_Add_Comp_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(Employees_With_Add_Comp, Sex =="M"))
F_Add_Comp_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(Employees_With_Add_Comp, Sex == "F"))
M_Over_Long_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(Employees_With_Over_Long, Sex =="M"))
F_Over_Long_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(Employees_With_Over_Long, Sex == "F"))
M_Long_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(Employees_With_Longevity, Sex =="M"))
F_Long_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(Employees_With_Longevity, Sex == "F"))
M_Over_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(Employees_With_Overtime, Sex =="M"))
F_Over_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(Employees_With_Overtime, Sex == "F"))
M_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(df, Sex =="M"))
F_Base_Total_lm <- lm(Total_Compensation ~ Base_Salary,
data = filter(df, Sex == "F"))the coefficients of the percentage things, on gender
# Male Add Comp
coef(lm(Total_Compensation ~ Percent_Add_Comp,
data = filter(df, , Sex =="M"))) (Intercept) Percent_Add_Comp
89199.697 1577.999
# Female Add Comp
coef(lm(Total_Compensation ~ Percent_Add_Comp,
data = filter(df, , Sex =="F"))) (Intercept) Percent_Add_Comp
86761.238 1282.196
coef(lm(Total_Compensation ~ Percent_Over_Long,
data = filter(df, , Sex =="M"))) (Intercept) Percent_Over_Long
246999.627 -1577.999
coef(lm(Total_Compensation ~ Percent_Over_Long,
data = filter(df, , Sex =="F"))) (Intercept) Percent_Over_Long
214980.872 -1282.196
coef(lm(Total_Compensation ~ Percent_Base,
data = filter(df, , Sex =="M"))) (Intercept) Percent_Base
246999.627 -1577.999
coef(lm(Total_Compensation ~ Percent_Base,
data = filter(df, , Sex =="F"))) (Intercept) Percent_Base
214980.872 -1282.196
coef(lm(Total_Compensation ~ Percent_Long,
data = filter(df, , Sex =="M"))) (Intercept) Percent_Long
98329.223 5675.633
coef(lm(Total_Compensation ~ Percent_Long,
data = filter(df, , Sex =="F"))) (Intercept) Percent_Long
88868.458 2432.145
Management_1 <- filter(df, Grade == "M1")
Management_2 <- filter(df, Grade == "M2")
Management_3 <- filter(df, Grade == "M3")
Management_df <- bind_rows(Management_1, Management_2, Management_3)
Management_df |> filter(Sex =="M") |> nrow()[1] 214
Management_df |> filter(Sex =="F") |> nrow()[1] 232
Management_df |> filter(Grade == "M1") |> filter(Sex =="M") |> nrow()[1] 17
Management_df |> filter(Grade == "M2") |> filter(Sex =="M") |> nrow()[1] 73
Management_df |> filter(Grade == "M2") |> filter(Sex =="M") |> nrow()[1] 73
Management_df |> filter(Grade == "M1") |> filter(Sex =="F") |> nrow()[1] 11
Management_df |> filter(Grade == "M2") |> filter(Sex =="F") |> nrow()[1] 74
Management_df |> filter(Grade == "M2") |> filter(Sex =="F") |> nrow()[1] 74
p <- Management_df |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Employees") +
geom_smooth(method = "lm") +
facet_wrap(~ Grade)
pp2 <- Management_df |>
ggplot(mapping = aes(x = Base_Salary, y = Total_Compensation)) +
geom_point(aes(color = "red")) +
scale_x_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
scale_y_continuous(labels = label_number(suffix = " K", scale = 1e-3)) +
labs(x = "Base Salary", y = "Total Compensation", title = "Base Salary vs. Total Compensatory for All Employees") +
geom_smooth(method = "lm") +
facet_grid(rows = vars(Grade), cols = vars(Sex))
p2Management_df |>
group_by(Sex) |>
summarize(mean = mean(Base_Salary))# A tibble: 2 × 2
Sex mean
<chr> <dbl>
1 F 150932.
2 M 155995.
Management_df |>
group_by(Sex) |>
summarize(mean = mean(Total_Compensation))# A tibble: 2 × 2
Sex mean
<chr> <dbl>
1 F 150968.
2 M 157012.
df |>
group_by(Sex) |>
summarize(mean = mean(Base_Salary))# A tibble: 2 × 2
Sex mean
<chr> <dbl>
1 F 87498.
2 M 92383.
df |>
group_by(Sex) |>
summarize(mean = mean(Total_Compensation))# A tibble: 2 × 2
Sex mean
<chr> <dbl>
1 F 91559.
2 M 106084.
Management_df |>
filter(Sex == "M") |>
summarize(Total_Compensation)# A tibble: 214 × 2
Employee Total_Compensation
<int> <dbl>
1 1 175873
2 2 145613.
3 14 139407.
4 16 152632.
5 239 152940
6 364 175861.
7 366 137728
8 436 144632.
9 437 157272.
10 438 134497.
# ℹ 204 more rows
Management_df |>
filter(Sex == "F") |>
summarize(Total_Compensation)# A tibble: 232 × 2
Employee Total_Compensation
<int> <dbl>
1 3 136970
2 115 166140.
3 465 152940
4 471 152940
5 504 152940
6 521 177064.
7 536 178511.
8 537 175873
9 555 175873
10 579 152940
# ℹ 222 more rows